In [1]:
from IPython.display import HTML

HTML('''
<style>
div.prompt {display:none}
div.input {display:none}
div.output_stderr {display:none}
</style>
<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 $('div.prompt').hide();
 $('div.output_stderr').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"></form>''')
Out[1]:

Analysis of store sell data

In [2]:
import csv
import numpy as np
import pandas as pd
from scipy import stats
import math
import io
import requests
from io import StringIO
from datetime import datetime
from IPython.display import display
%matplotlib inline
import matplotlib.pyplot as plt
In [3]:
df = pd.read_csv('musedb.csv',sep=',',error_bad_lines=False,warn_bad_lines=False)

Number of products sold

In [4]:
df["BILLNO"].count()
Out[4]:
670763

Number of Bills generated

In [5]:
df["BILLNO"].nunique()
Out[5]:
49134

Approximately average product selling frequency on each bill is....

In [6]:
df["BILLNO"].count()/df["BILLNO"].nunique()
Out[6]:
13
In [7]:
import cufflinks as cf
cf.set_config_file(offline=True)
import plotly as py
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot,offline
import plotly.graph_objs as go
In [8]:
vl_reg=df["STOREID"].value_counts()
reg=pd.DataFrame({'labels':vl_reg.index.tolist(), 'values':vl_reg.values.tolist()})
res=reg[reg["values"]>6]
samp=res.iplot(kind='pie',labels='labels',values='values',title="No of product sold at different store")
In [9]:
brand=df["BRAND_DESC"].value_counts()
l=brand.sum()
reg=pd.DataFrame({'labels':brand.index.tolist(), 'values':brand.values.tolist()})
res=reg[reg["values"]>(l*0.008)]
samp=res.iplot(kind='pie',labels='labels',values='values',title="No of product sold of different brand")
In [10]:
df["UPDATE_STAMP"]=pd.to_datetime(df["UPDATE_STAMP"], format='%Y-%m-%d %H:%M')
In [11]:
df["up_mon"]=df["UPDATE_STAMP"].map(lambda x: x.strftime('%Y-%m'))
In [12]:
cat=df["CATEGORY_DESC"].value_counts()
l=float(cat.sum())
cat=cat[cat>(l*0.005)]
df_cat= df[(df["CATEGORY_DESC"]!="Others") & (df["CATEGORY_DESC"]!="OTHERS") & (df["CATEGORY_DESC"]!="Unknown")]
df_cat=df_cat[df_cat.CATEGORY_DESC.isin(cat.index.tolist())]
result = df_cat.pivot_table(index='CATEGORY_DESC', columns="up_mon", values="BILLNO",aggfunc="count")
iplot(result.iplot(kind='bar', barmode="stack", asFigure=True,
                         title="Count of diferent categories sold in each month", 
                         yTitle="Count"))
In [13]:
sub=df["SUBCATEGORY_DESC"].value_counts()
l=float(sub.sum())
sub=sub[sub>(l*0.005)]
df_sub=df[df.SUBCATEGORY_DESC.isin(sub.index.tolist())]
result = df_sub.pivot_table(index='SUBCATEGORY_DESC', columns="up_mon", values="BILLNO",aggfunc="count")
iplot(result.iplot(kind='bar', barmode="stack", asFigure=True,
                         title="Count of diferent subcategories sold in each month", 
                         yTitle="Count",
                         margin=dict(l=40,r=20,b=210,t=40)))
In [14]:
result = df.pivot_table( columns="up_mon", values="BILLNO",aggfunc="count")
iplot(result.T.iplot(kind='bar', barmode='stack', asFigure=True,
                         title="Frequency of product sold in each month", 
                         xTitle="Month year", 
                         yTitle="No of product"))
In [15]:
result = df_sub.pivot_table(index='STOREID', columns="up_mon", values="BILLNO",aggfunc="count")
iplot(result.T.iplot(kind='bar', barmode="stack", asFigure=True,
                         title="Most frequently used store in each month", 
                         yTitle="Count"))
In [16]:
iplot(df.drop_duplicates(subset=["BILLNO"]).pivot_table(columns="up_mon", values="BILLNO",aggfunc="count").T.iplot(kind='bar', barmode="stack", asFigure=True,
                         title="No of bills generated on each month", 
                         yTitle="Count of bills"))